Analysis

Data Exploration

In [1]:
# load dataset from csv file
# load python modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display # Allows the use of display() for DataFrames
#import seaborn as sns
%matplotlib inline

def load_data(file_name):

    # Load the wholesale dataset
    dataset_file = file_name
    try:
        data = pd.read_csv(dataset_file, delimiter=",")

        # temporary drop column 'status' due to type string
        status_cl = pd.DataFrame(data['STATUS'], columns = ['STATUS'], dtype=np.str).reset_index(drop=True)
        data = data.drop(['STATUS'], 1)

        # apply rest of columns converting to float type
        data = data.apply(pd.to_numeric, args=('coerce',)).reset_index(drop=True)


        data = pd.concat([data, status_cl], axis=1)
        data = data.reset_index(drop=True)
        print data.dtypes
        print "Wholesale flight dataset has {} samples with {} features each.".format(*data.shape)
    except:
        print "Dataset could not be loaded!"
    return data
print "Loading training data..."
learn_data = load_data(r'raw_data.csv').reset_index(drop=True)
print "\nLoading testing data..."
test_data = load_data(r'test_data.csv').reset_index(drop=True)
all_data = pd.concat([learn_data, test_data], axis=0).reset_index(drop=True)
print all_data.columns
Loading training data...
BAR            float64
MOTOR1           int64
MOTOR2           int64
MOTOR3           int64
MOTOR4           int64
LIDAR          float64
THRUST         float64
ACC_X          float64
ACC_Y          float64
ACC_Z          float64
MAG_Z          float64
GYRO_X         float64
GYRO_Y         float64
GYRO_Z         float64
PITCH          float64
ROLL           float64
YAW            float64
TIME             int64
LOG_ID           int64
LAND_STATUS      int64
STATUS          object
dtype: object
Wholesale flight dataset has 41147 samples with 21 features each.

Loading testing data...
BAR            float64
MOTOR1           int64
MOTOR2           int64
MOTOR3           int64
MOTOR4           int64
LIDAR          float64
THRUST         float64
ACC_X          float64
ACC_Y          float64
ACC_Z          float64
MAG_Z          float64
GYRO_X         float64
GYRO_Y         float64
GYRO_Z         float64
PITCH          float64
ROLL           float64
YAW            float64
TIME             int64
LOG_ID           int64
LAND_STATUS      int64
STATUS          object
dtype: object
Wholesale flight dataset has 46937 samples with 21 features each.
Index([u'BAR', u'MOTOR1', u'MOTOR2', u'MOTOR3', u'MOTOR4', u'LIDAR', u'THRUST',
       u'ACC_X', u'ACC_Y', u'ACC_Z', u'MAG_Z', u'GYRO_X', u'GYRO_Y', u'GYRO_Z',
       u'PITCH', u'ROLL', u'YAW', u'TIME', u'LOG_ID', u'LAND_STATUS',
       u'STATUS'],
      dtype='object')
In [2]:
# samples of dataset
from IPython.display import display # Allows the use of display() for DataFrames
indices = [1,100, 200, 1073,9999, 32050]
samples = pd.DataFrame(np.round(all_data.loc[indices], 3), columns = all_data.keys()[0:10])
display(samples)
samples = pd.DataFrame(np.round(all_data.loc[indices], 3), columns = all_data.keys()[10:])
display(samples)
BAR MOTOR1 MOTOR2 MOTOR3 MOTOR4 LIDAR THRUST ACC_X ACC_Y ACC_Z
1 1019.44 1298 1298 1298 1298 0.047 0.000 -0.906 0.055 -9.837
100 1019.29 1517 1624 1507 1531 0.298 0.686 -0.116 -0.083 -13.008
200 1018.91 1437 1486 1445 1466 3.263 0.521 0.334 -0.291 -9.822
1073 1019.23 1425 1515 1396 1481 0.495 0.512 -0.498 -0.075 -10.002
9999 1019.35 1513 1616 1509 1572 0.177 0.622 -0.233 -0.001 -12.335
32050 1012.42 1535 1525 1461 1462 3.150 0.546 0.091 0.199 -8.652
MAG_Z GYRO_X GYRO_Y GYRO_Z PITCH ROLL YAW TIME LOG_ID LAND_STATUS STATUS
1 0.489 0.015 0.027 -0.003 -0.081 0.016 2.633 64927667 0 0 take_off
100 0.457 -0.221 0.003 -0.055 0.083 -0.015 2.614 66566750 0 0 take_off
200 0.483 -0.008 0.057 -0.003 0.001 0.033 2.628 68336653 0 0 take_off
1073 0.474 0.039 0.024 -0.035 -0.056 0.021 -1.934 84077889 0 1 land
9999 0.463 -0.156 -0.133 -0.032 0.090 -0.011 3.000 42166481 5 0 take_off
32050 0.498 0.016 0.054 -0.022 0.002 -0.031 -1.020 550829712 10 0 flight
In [3]:
display(all_data[all_data.keys()[0:11]].describe().round(3))
BAR MOTOR1 MOTOR2 MOTOR3 MOTOR4 LIDAR THRUST ACC_X ACC_Y ACC_Z MAG_Z
count 88084.000 88084.000 88084.000 88084.000 88084.000 88084.000 88084.000 88084.000 88084.000 88084.000 88084.000
mean 1014.537 1466.298 1510.502 1441.252 1476.070 5.890 0.537 -0.064 -0.064 -9.802 0.482
std 2.757 97.355 100.373 99.777 99.054 8.340 0.105 0.659 0.528 1.357 0.021
min 1009.240 910.000 910.000 910.000 910.000 0.000 0.000 -20.925 -7.646 -47.693 -0.519
25% 1012.170 1434.000 1480.000 1406.000 1445.000 0.059 0.509 -0.378 -0.361 -10.153 0.472
50% 1013.980 1477.000 1517.000 1450.000 1483.000 4.887 0.546 -0.057 -0.097 -9.775 0.484
75% 1017.410 1514.000 1559.000 1492.000 1526.000 7.956 0.585 0.235 0.191 -9.383 0.494
max 1019.680 1879.000 1879.000 1819.000 1878.000 65.324 1.000 16.850 15.148 9.816 0.561
In [4]:
display(all_data[all_data.keys()[11:]].describe().round(3))
GYRO_X GYRO_Y GYRO_Z PITCH ROLL YAW TIME LOG_ID LAND_STATUS
count 88084.000 88084.000 88084.000 88084.000 88084.000 88084.000 8.808400e+04 88084.000 88084.000
mean -0.002 0.000 -0.006 -0.012 0.007 -0.212 2.253052e+08 13.414 0.014
std 0.151 0.230 0.207 0.095 0.066 2.124 2.585364e+08 6.671 0.118
min -4.609 -4.209 -2.315 -1.424 -1.412 -3.141 3.682341e+07 0.000 0.000
25% -0.054 -0.058 -0.034 -0.048 -0.019 -2.549 7.800245e+07 8.000 0.000
50% -0.001 0.001 0.000 -0.008 0.010 -0.136 1.295035e+08 15.000 0.000
75% 0.048 0.058 0.034 0.028 0.040 1.643 2.569603e+08 18.000 0.000
max 2.270 5.734 1.986 0.772 3.120 3.142 1.633813e+09 24.000 1.000

Data visualization

In [5]:
pd.scatter_matrix(all_data[all_data.keys()[0:6]], alpha = 0.3, figsize = (24,8), diagonal = 'kde');
In [6]:
pd.scatter_matrix(all_data[all_data.keys()[6:12]], alpha = 0.3, figsize = (24,8), diagonal = 'kde');
In [7]:
pd.scatter_matrix(all_data[all_data.keys()[11:-4]], alpha = 0.3, figsize = (24,8), diagonal = 'kde');

Particular log id data visual exploration

In [8]:
def show_data_from_logid(log_id, features, time_interval=1.0):
    logid_data = all_data[all_data['LOG_ID'] == log_id]
    
    # process values shifting if necessary
    tmp_features = []
    for item in features:
        if type(item) is tuple:
            feature_name = item[0]
            if len(item) == 3:
                rate = item[2] # scale value
                logid_data[feature_name] = logid_data[feature_name] * rate
            value_shift = item[1] # shift value
            logid_data[feature_name] = logid_data[feature_name] + value_shift                
            tmp_features.append(feature_name)
        else:
            tmp_features.append(item)
    features = tmp_features
        
    # scale time to seconds
    logid_data['TIME'] = logid_data['TIME'] / (0.5e+6)
    
    # prepare x sticks
    x_stick = np.arange(logid_data['TIME'].min(), logid_data['TIME'].max(), time_interval)
    x_stick = pd.DataFrame(x_stick, columns = ['Time'])
    
    # plot data
    fig, ax = plt.subplots(figsize = (14,8))
    logid_data.plot(x = 'TIME', y = features, 
                  xticks=x_stick['Time'], kind = 'line', ax=ax, rot=70);
In [9]:
features2use = ['MOTOR1', 'MOTOR2', 'MOTOR3', 'MOTOR4', ('BAR', -100.2e+3, 100.0)]
show_data_from_logid(1,features2use, 1.5)
C:\Users\sholc2005\Anaconda2\lib\site-packages\ipykernel\__main__.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\sholc2005\Anaconda2\lib\site-packages\ipykernel\__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\sholc2005\Anaconda2\lib\site-packages\ipykernel\__main__.py:20: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
In [10]:
features2use = ['LIDAR']
show_data_from_logid(1,features2use, 1.5)
C:\Users\sholc2005\Anaconda2\lib\site-packages\ipykernel\__main__.py:20: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
In [11]:
features2use = ['ACC_X','ACC_Y', ('ACC_Z', 8.0), 'PITCH', 'ROLL', 'MAG_Z']
show_data_from_logid(1, features2use, 1.5)
C:\Users\sholc2005\Anaconda2\lib\site-packages\ipykernel\__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\sholc2005\Anaconda2\lib\site-packages\ipykernel\__main__.py:20: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Data preprocessing

Relevant features selection

In [12]:
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeRegressor

# create dummies
def preprocess_dumies(X):
    outX = pd.DataFrame(index=X.index) # output dataframe, initially empty
    for col, col_data in X.iteritems():
    # If data type is non-numeric, try to replace all yes/no values with 1/0
        if col_data.dtype == object and col=='STATUS':
            col_data = pd.get_dummies(col_data, prefix=col) # e.g. 'status' => 'status_flight', 'status_take_off'
        outX = outX.join(col_data) # collect column(s) in output dataframe
    return outX

print all_data.index
all_data = preprocess_dumies(all_data) 
determ_result = None
keys = all_data.keys()
print keys
features2ignore = ['TIME', 'LOG_ID', 'LAND_STATUS', 'index']

def remove_features(all_features, features_to_remove):
    # remove features from the list if they exists
    new_list = []
    for feature in all_features:
        if feature in features_to_remove:
            continue
        else:
            new_list.append(feature)
    return new_list

# make determinant analysis
determ_result = pd.DataFrame(columns=['Feature', 'Score_value'])
regressor = DecisionTreeRegressor(random_state=333)
row_counter = 0
for key in all_data.keys():
    if key not in features2ignore:
        # get y data
        y_data = all_data[key]
        
        # get x data
        x_features = remove_features(all_data.keys(), features2ignore + [key])
        x_data = all_data[x_features]

        X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25, random_state = 333)

        regressor.fit(X_train, y_train)
        score = regressor.score(X_test, y_test)
        determ_result.loc[row_counter] = [key, score]
        row_counter += 1
display(determ_result)
C:\Users\sholc2005\Anaconda2\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
RangeIndex(start=0, stop=88084, step=1)
Index([u'BAR', u'MOTOR1', u'MOTOR2', u'MOTOR3', u'MOTOR4', u'LIDAR', u'THRUST',
       u'ACC_X', u'ACC_Y', u'ACC_Z', u'MAG_Z', u'GYRO_X', u'GYRO_Y', u'GYRO_Z',
       u'PITCH', u'ROLL', u'YAW', u'TIME', u'LOG_ID', u'LAND_STATUS',
       u'STATUS_flight', u'STATUS_land', u'STATUS_take_off'],
      dtype='object')
Feature Score_value
0 BAR 0.980263
1 MOTOR1 0.980727
2 MOTOR2 0.984043
3 MOTOR3 0.979196
4 MOTOR4 0.983169
5 LIDAR 0.760819
6 THRUST 0.947038
7 ACC_X 0.841638
8 ACC_Y 0.834753
9 ACC_Z 0.875048
10 MAG_Z 0.972091
11 GYRO_X 0.740670
12 GYRO_Y 0.757531
13 GYRO_Z 0.911302
14 PITCH 0.907833
15 ROLL 0.899209
16 YAW 0.954149
17 STATUS_flight 1.000000
18 STATUS_land 1.000000
19 STATUS_take_off 1.000000

Make features' values positive

In [13]:
from sklearn import preprocessing

# gravity constant
g = 9.80665

# features to use
#features2use = ['THRUST', 'ACC_X','ACC_Y','ACC_Z', 'MAG_Z', 'PITCH', 'ROLL']

features2use = ['MOTOR1', 'MOTOR2', 'MOTOR3', 'MOTOR4', 'THRUST', \
                'ACC_X','ACC_Y','ACC_Z', 'MAG_Z', 'PITCH', 'ROLL']

# set min values for each feature
min_vals = {'MOTOR1': 900, 'MOTOR2': 900, 'MOTOR3': 900, 'MOTOR4': 900, 
            'THRUST': 1.0, 
            'ACC_X': 8.0 * g, 'ACC_Y': 8.0 * g, 'ACC_Z': 8.0 * g,
            'MAG_Z': 1.3,
            'PITCH': np.pi, 'ROLL': np.pi, 'YAW': np.pi
            }

# set max values for each feature
max_vals = {'MOTOR1': 1890, 'MOTOR2': 1890, 'MOTOR3': 1890, 'MOTOR4': 1890,
            'THRUST': 1.0 + min_vals['THRUST'],
            'ACC_X': 8.0 * g + min_vals['ACC_X'], 'ACC_Y': 8.0 * g + min_vals['ACC_Y'], 'ACC_Z': 8.0 * g + min_vals['ACC_Z'],
            'MAG_Z': 1.3 + min_vals['MAG_Z'],
            'PITCH': np.pi + min_vals['PITCH'], 'ROLL': np.pi + min_vals['ROLL'], 'YAW': np.pi + min_vals['YAW']}

def make_positive(data, features2use):
    # make all data positive
    
    data_positive = data.copy()

    for key in features2use:
        # make negative values as positive
        if key in min_vals.keys():
            data_positive[key] = data_positive[key] + min_vals[key]
        else:
            min_val = data_positive[key].min()
            if min_val <= 0.0:
                data_positive[key] = data_positive[key] + min_val
        
        
    return data_positive.reset_index(drop=True)

'''
        if key in features2use:
            if key not in ['MOTOR1', 'MOTOR2', 'MOTOR3', 'MOTOR4']:
                    data_positive[key] = data_positive[key] + min_vals[key]
            else:
                min_val = data_positive[key].min()
                if min_val <= 0.0:
                    data_positive[key] = data_positive[key] + min_val
'''

def scale(data, features2scale, min_=0.00001, max_=255.0):
    # scale data in a given range
    scaled = data.copy()
    
    for key in scaled.keys():
        if key in features2scale:
            scaled[key] = ((max_ - min_) / (data[key].max() - data[key].min())) * (scaled[key] -  data[key].min()) + min_
        
    return scaled.reset_index(drop=True)
In [14]:
data_positive = make_positive(all_data, features2use)
data_scaled = scale(data_positive, features2use)
In [15]:
print "Features before applying: make positive and scaling:"
display(data_positive[features2use].describe())

print "Features with positive values:"
display(data_positive[features2use].describe())

print "Scaled features:"
display(data_scaled[features2use].describe())
Features before applying: make positive and scaling:
MOTOR1 MOTOR2 MOTOR3 MOTOR4 THRUST ACC_X ACC_Y ACC_Z MAG_Z PITCH ROLL
count 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000
mean 2366.297852 2410.501919 2341.252009 2376.069797 1.536727 78.388910 78.388942 68.651312 1.781749 3.129401 3.148578
std 97.354726 100.372674 99.776605 99.053814 0.105075 0.658954 0.527975 1.357033 0.020826 0.094939 0.066477
min 1810.000000 1810.000000 1810.000000 1810.000000 1.000000 57.528275 70.807159 30.760005 0.780630 1.717451 1.730067
25% 2334.000000 2380.000000 2306.000000 2345.000000 1.509179 78.074852 78.092560 68.299778 1.772462 3.093987 3.122127
50% 2377.000000 2417.000000 2350.000000 2383.000000 1.546479 78.396069 78.355883 68.678271 1.784339 3.133144 3.151793
75% 2414.000000 2459.000000 2392.000000 2426.000000 1.585419 78.688465 78.644223 69.070576 1.794051 3.169319 3.181318
max 2779.000000 2779.000000 2719.000000 2778.000000 2.000000 95.303137 93.600830 88.268961 1.860690 3.913941 6.261976
Features with positive values:
MOTOR1 MOTOR2 MOTOR3 MOTOR4 THRUST ACC_X ACC_Y ACC_Z MAG_Z PITCH ROLL
count 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000
mean 2366.297852 2410.501919 2341.252009 2376.069797 1.536727 78.388910 78.388942 68.651312 1.781749 3.129401 3.148578
std 97.354726 100.372674 99.776605 99.053814 0.105075 0.658954 0.527975 1.357033 0.020826 0.094939 0.066477
min 1810.000000 1810.000000 1810.000000 1810.000000 1.000000 57.528275 70.807159 30.760005 0.780630 1.717451 1.730067
25% 2334.000000 2380.000000 2306.000000 2345.000000 1.509179 78.074852 78.092560 68.299778 1.772462 3.093987 3.122127
50% 2377.000000 2417.000000 2350.000000 2383.000000 1.546479 78.396069 78.355883 68.678271 1.784339 3.133144 3.151793
75% 2414.000000 2459.000000 2392.000000 2426.000000 1.585419 78.688465 78.644223 69.070576 1.794051 3.169319 3.181318
max 2779.000000 2779.000000 2719.000000 2778.000000 2.000000 95.303137 93.600830 88.268961 1.860690 3.913941 6.261976
Scaled features:
MOTOR1 MOTOR2 MOTOR3 MOTOR4 THRUST ACC_X ACC_Y ACC_Z MAG_Z PITCH ROLL
count 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000 88084.000000
mean 146.394176 158.026824 149.031096 149.119630 136.865431 140.820160 84.819813 168.013545 236.362239 163.919349 79.816348
std 25.619664 26.413860 27.990136 26.093721 26.794079 4.448280 5.906627 6.017210 4.916946 11.021920 3.740517
min 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
25% 137.894741 150.000004 139.141919 140.934922 129.840710 138.700101 81.504095 166.454812 234.169633 159.808026 78.327999
50% 149.210530 159.736846 151.485153 150.945252 139.352131 140.868481 84.449974 168.133084 236.973706 164.353955 79.997245
75% 158.947372 170.789477 163.267330 162.272731 149.281895 142.842307 87.675726 169.872602 239.266854 168.553596 81.658533
max 255.000000 255.000000 255.000000 255.000000 255.000000 255.000000 255.000000 255.000000 255.000000 255.000000 255.000000
In [16]:
pd.scatter_matrix(data_scaled[features2use[0:5]], alpha = 0.3, figsize = (24,8), diagonal = 'kde');
In [17]:
pd.scatter_matrix(data_scaled[features2use[6:12]], alpha = 0.3, figsize = (24,8), diagonal = 'kde');

Outliers detection

In [18]:
def update_dictionary(ind_dic, indices):
    # count indices found in outliers for features
    for value in indices:
        if ind_dic.has_key(value):
            ind_dic[value] = ind_dic[value] + 1
        else:
            ind_dic[value] = 1

def display_indices(ind_dic):
    # display how many times sample is detected in the outliers
    
    items = ind_dic.items()
    items.sort(key=lambda tup: tup[1], reverse=True)
    for item in items:
        if item[1] > 1:
            print "{:3d}: repeats in {:d} times".format(item[0], item[1])

indices_dic = {} # dictionary for indices in outliers
all_outliers = [] # list of outliers to revoke

for feature in features2use:
    
    # TODO: Calculate Q1 (10th percentile of the data) for the given feature
    Q1 = np.percentile(data_scaled[feature], 10.0)
    
    # TODO: Calculate Q3 (90th percentile of the data) for the given feature
    Q3 = np.percentile(data_scaled[feature], 90.0)
    
    # TODO: Use the interquartile range to calculate an outlier step (1.5 times the interquartile range)
    
    step = 1.7 * (Q3 - Q1)
    
    print "\nData points considered outliers for the feature '{}':".format(feature)
    print "Q1-step={:.3f}, Q3+step={:.3f}, step={:.3f}".format(Q1-step, Q3+step, step)
    
    # Display the outliers
    cols = [feature, 'LAND_STATUS', 'STATUS_flight', 'STATUS_land', 'STATUS_take_off']
    outliers = data_scaled[~((data_scaled[feature] >= Q1 - step) & 
                             (data_scaled[feature] <= Q3 + step)) & 
                           ~((data_scaled['STATUS_land'] == 1.0) & (data_scaled['LAND_STATUS'] == 1.0))]
    #all_outliers = all_outliers + list(outliers.index)
    update_dictionary(indices_dic, list(outliers.index))
    #display(outliers[cols])
Data points considered outliers for the feature 'MOTOR1':
Q1-step=40.868, Q3+step=250.447, step=80.974

Data points considered outliers for the feature 'MOTOR2':
Q1-step=53.579, Q3+step=264.316, step=81.421

Data points considered outliers for the feature 'MOTOR3':
Q1-step=23.536, Q3+step=274.104, step=96.810

Data points considered outliers for the feature 'MOTOR4':
Q1-step=40.489, Q3+step=259.557, step=84.640

Data points considered outliers for the feature 'THRUST':
Q1-step=34.542, Q3+step=235.443, step=77.621

Data points considered outliers for the feature 'ACC_X':
Q1-step=121.922, Q3+step=159.806, step=14.637

Data points considered outliers for the feature 'ACC_Y':
Q1-step=57.132, Q3+step=112.827, step=21.518

Data points considered outliers for the feature 'ACC_Z':
Q1-step=149.033, Q3+step=187.753, step=14.960

Data points considered outliers for the feature 'MAG_Z':
Q1-step=216.315, Q3+step=256.465, step=15.512

Data points considered outliers for the feature 'PITCH':
Q1-step=121.941, Q3+step=206.723, step=32.757

Data points considered outliers for the feature 'ROLL':
Q1-step=63.294, Q3+step=96.205, step=12.716

ALL Motors - OK Thrust outliers - OK ACC_X: 75.0 - 190.0 ACC_Y: 40.0 - 175.0 ACC_Z: 75.0 - 220.0 MAG_Z: 150.0 - Pitch: 80.0 - 245.0 Roll: 25.0 - 125.0

In [19]:
outliers_dict = {
                 'MOTOR1': (40.868, 250.447),
                 'MOTOR2': (53.579, 264.316),
                 'MOTOR3': (23.536, 274.104),
                 'MOTOR4': (40.489, 259.557),
                 'THRUST': (34.542, 235.443),
                 'ACC_X': (75.0, 190.0),
                 'ACC_Y': (40.0, 175.0),
                 'ACC_Z': (75.0, 220.0),
                 'MAG_Z': (150.0, None),
                 'PITCH': (80.0, 245.0),
                 'ROLL': (25.0, 125.0)}

def cut_outliers(input_dataset, params):
    # cuts the outliers of particular feature in a given range
    
    # get range for each feature
    dataset = input_dataset.copy()
    columns = input_dataset.columns
    indexes = []
    for key in params.keys():
        if key in columns:
            print key
            #continue
            
        range_from = params[key][0]
        range_to = params[key][1]
        
        if range_from is None:
            range_from = -np.inf
            print "range_from", range_from
        if range_to is None:
            print "range_to", range_to
            range_to = np.inf
        
        sub_dataset = dataset[~((dataset[key] >= range_from) & (dataset[key] <= range_to))]
        dataset= dataset.drop(sub_dataset.index)
#        indexes += list(sub_dataset.index)
#        print len(indexes)
        
#    indexes = list(np.unique(np.array(indexes, dtype = np.int64)))
   
#    sub_set = dataset.ix[indexes]
    return dataset
In [20]:
#print data_scaled.shape
filtered_outliers = cut_outliers(data_scaled, outliers_dict)
THRUST
ACC_Z
ACC_Y
MAG_Z
range_to None
MOTOR4
MOTOR3
MOTOR2
MOTOR1
PITCH
ACC_X
ROLL
In [21]:
display(filtered_outliers.describe())
BAR MOTOR1 MOTOR2 MOTOR3 MOTOR4 LIDAR THRUST ACC_X ACC_Y ACC_Z ... GYRO_Z PITCH ROLL YAW TIME LOG_ID LAND_STATUS STATUS_flight STATUS_land STATUS_take_off
count 86575.000000 86575.000000 86575.000000 86575.000000 86575.000000 86575.000000 86575.000000 86575.000000 86575.000000 86575.000000 ... 86575.000000 86575.000000 86575.000000 86575.000000 8.657500e+04 86575.000000 86575.000000 86575.000000 86575.000000 86575.000000
mean 1014.528161 146.472485 158.382340 149.090427 149.392034 5.962414 136.354502 140.878211 84.853294 168.234194 ... -0.005463 164.005948 79.822421 -0.198400 2.262052e+08 13.383679 0.013768 0.569576 0.301161 0.129264
std 2.758743 22.528257 22.720545 25.111304 22.946512 8.383467 23.491966 3.888920 5.594636 5.264307 ... 0.204944 10.577573 3.624692 2.122542 2.597899e+08 6.676389 0.116529 0.495138 0.458765 0.335493
min 1009.239990 43.421061 53.684218 43.481856 43.465917 0.000000 63.750038 75.113021 42.753374 81.316028 ... -2.315323 85.817994 30.673839 -3.141173 3.738236e+07 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1012.159973 138.157899 150.263162 139.422447 141.198352 0.058000 129.990103 138.723893 81.544778 166.507068 ... -0.033689 159.829754 78.330846 -2.521631 7.809806e+07 8.000000 0.000000 0.000000 0.000000 0.000000
50% 1013.969971 149.210530 159.736846 151.485153 150.945252 5.001000 139.301320 140.881648 84.472506 168.153191 ... 0.000449 164.366613 80.000907 -0.106443 1.295303e+08 14.000000 0.000000 1.000000 0.000000 0.000000
75% 1017.429993 158.684214 170.526319 162.986802 162.009301 7.992501 149.006151 142.845024 87.700345 169.897090 ... 0.033867 168.559871 81.663282 1.648545 2.583633e+08 18.000000 0.000000 1.000000 1.000000 0.000000
max 1019.669983 247.631579 255.000000 255.000000 255.000000 65.324005 234.873986 175.567120 166.557003 207.164207 ... 1.986229 240.668964 101.765360 3.141524 1.633813e+09 24.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 23 columns

In [22]:
pd.scatter_matrix(filtered_outliers[features2use[0:6]], alpha = 0.3, figsize = (24,8), diagonal = 'kde');
In [23]:
pd.scatter_matrix(filtered_outliers[features2use[6:12]], alpha = 0.3, figsize = (24,8), diagonal = 'kde');

Data scaling

Apply logarithm function for all features

In [24]:
def log_scale(dataset):
    # scale features using logarithm
    log_data = dataset.copy()
    for feature in features2use:
        log_data[feature] = np.log(log_data[feature])
    return log_data
In [25]:
all_log_data = log_scale(filtered_outliers)
display(all_log_data.describe())
BAR MOTOR1 MOTOR2 MOTOR3 MOTOR4 LIDAR THRUST ACC_X ACC_Y ACC_Z ... GYRO_Z PITCH ROLL YAW TIME LOG_ID LAND_STATUS STATUS_flight STATUS_land STATUS_take_off
count 86575.000000 86575.000000 86575.000000 86575.000000 86575.000000 86575.000000 86575.000000 86575.000000 86575.000000 86575.000000 ... 86575.000000 86575.000000 86575.000000 86575.000000 8.657500e+04 86575.000000 86575.000000 86575.000000 86575.000000 86575.000000
mean 1014.528161 4.972310 5.052516 4.986824 4.992131 5.962414 4.896233 4.947506 4.438873 5.124843 ... -0.005463 5.097640 4.378657 -0.198400 2.262052e+08 13.383679 0.013768 0.569576 0.301161 0.129264
std 2.758743 0.181450 0.167836 0.201671 0.180843 8.383467 0.209530 0.028129 0.063357 0.032556 ... 0.204944 0.069065 0.049574 2.122542 2.597899e+08 6.676389 0.116529 0.495138 0.458765 0.335493
min 1009.239990 3.770945 3.983119 3.772344 3.771977 0.000000 4.154970 4.318994 3.755448 4.398343 ... -2.315323 4.452229 3.423410 -3.141173 3.738236e+07 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1012.159973 4.928397 5.012388 4.937509 4.950166 0.058000 4.867458 4.932486 4.401152 5.115038 ... -0.033689 5.074109 4.360941 -2.521631 7.809806e+07 8.000000 0.000000 0.000000 0.000000 0.000000
50% 1013.969971 5.005358 5.073528 5.020488 5.016917 5.001000 4.936639 4.947920 4.436426 5.124875 ... 0.000449 5.102099 4.382038 -0.106443 1.295303e+08 14.000000 0.000000 1.000000 0.000000 0.000000
75% 1017.429993 5.066916 5.138890 5.093669 5.087654 7.992501 5.003988 4.961760 4.473926 5.135193 ... 0.033867 5.127291 4.402604 1.648545 2.583633e+08 18.000000 0.000000 1.000000 1.000000 0.000000
max 1019.669983 5.511942 5.541264 5.541264 5.541264 65.324005 5.459049 5.168021 5.115338 5.333512 ... 1.986229 5.483422 4.622670 3.141524 1.633813e+09 24.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 23 columns

In [26]:
pd.scatter_matrix(all_log_data[features2use[0:6]], alpha = 0.3, figsize = (24,8), diagonal = 'kde');
In [27]:
pd.scatter_matrix(all_log_data[features2use[6:12]], alpha = 0.3, figsize = (24,8), diagonal = 'kde');

Features Transformation (ICA/FA)

ICA analysis

In [28]:
from scipy.stats import shapiro

# check data for normal distributed with Shapiro-Wilk test
# this necessary for ICA using
for feature in features2use:
    shapiro_w, shapiro_p = shapiro(all_log_data[feature])
    print "Feature: {:s}, w={:.3f}, p={:.3f}".format(feature, shapiro_w, shapiro_p)
Feature: MOTOR1, w=0.804, p=0.000
Feature: MOTOR2, w=0.803, p=0.000
Feature: MOTOR3, w=0.810, p=0.000
Feature: MOTOR4, w=0.808, p=0.000
Feature: THRUST, w=0.741, p=0.000
Feature: ACC_X, w=0.901, p=0.000
Feature: ACC_Y, w=0.947, p=0.000
Feature: ACC_Z, w=0.756, p=0.000
Feature: MAG_Z, w=0.743, p=0.000
Feature: PITCH, w=0.786, p=0.000
Feature: ROLL, w=0.761, p=0.000
C:\Users\sholc2005\Anaconda2\lib\site-packages\scipy\stats\morestats.py:1329: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")
In [29]:
features2use
Out[29]:
['MOTOR1',
 'MOTOR2',
 'MOTOR3',
 'MOTOR4',
 'THRUST',
 'ACC_X',
 'ACC_Y',
 'ACC_Z',
 'MAG_Z',
 'PITCH',
 'ROLL']

FA analysis

In [30]:
from sklearn.decomposition import FastICA
from sklearn.decomposition import FactorAnalysis

# make FA analysis
fa_ = FactorAnalysis(n_components = 6, random_state = 333)

# get only landing data for FA analysis
land_data = all_log_data[all_log_data['STATUS_land'] == 1.0]

land_data = land_data[features2use]

fa_results = fa_.fit_transform(land_data)
C:\Users\sholc2005\Anaconda2\lib\site-packages\sklearn\decomposition\factor_analysis.py:224: ConvergenceWarning: FactorAnalysis did not converge. You might want to increase the number of iterations.
  ConvergenceWarning)
In [31]:
def plot_fa_results(data, fa):
    
    # Dimension indexing
    dimensions = ['Component {}'.format(i) for i in range(1, len(fa.components_) + 1)]

    # FA components
    
    components = pd.DataFrame(np.round(fa.components_, 4), columns = data.keys())
    components.index = dimensions

    # Create a bar plot visualization
    fig, ax = plt.subplots(figsize = (14,8))

    # Plot the feature weights as a function of the components
    components.plot(ax = ax, kind = 'bar', legend = True, colormap = "Paired");
    ax.set_ylabel("Features' unmixed matrix")
    ax.set_xticklabels(dimensions, rotation=0)
    #patches, labels = ax.get_legend_handles_labels()
    #ax.legend(patches, labels, loc='best')

# plot FA components
#col_names = all_log_data[features4use].keys()
fa_data = pd.DataFrame(fa_.components_, columns=features2use)
plot_fa_results(fa_data, fa_)

Vizualize FA results on a particular log file

In [32]:
def get_data_intervals(timeseries, interval=3.0):
    # returns intervals of contin.time
    interv = []
    prev = min(timeseries)
    iterval_start = prev
    for i in range(1, len(timeseries)):
        if timeseries[i] - prev > interval:
            interv.append((iterval_start, timeseries[i - 1]))
            iterval_start = timeseries[i]
        prev = timeseries[i]
    interv.append((iterval_start, timeseries[-1]))
    return interv

def vizualize_fa(dataset, log_id, fa, features, drone_state=None, use_components=[]):
    # vizualize fa analysis for the particular log id
    
    if drone_state is not None:
        logid_data = dataset[(dataset['LOG_ID'] == log_id) & 
                             (dataset['STATUS_' + drone_state])].reset_index(drop=True)
        title = 'FA analysis: logID={:d}, drone status {:s}'.format(log_id, drone_state)
    else:
        logid_data = dataset[(dataset['LOG_ID'] == log_id)].reset_index(drop=True)
        title = 'FA analysis: logID={:d}, drone status ALL'.format(log_id)
    fa_data = fa.transform(logid_data[features])
    
    components_names = ['Component {}'.format(i) for i in range(1, len(fa.components_) + 1)]
    

        
    fa_logid = pd.DataFrame(fa_data, columns=components_names).reset_index(drop=True)
    
    data_to_plot = pd.concat([logid_data, fa_logid], axis=1).reset_index(drop=True)
    #data_to_plot['LAND_STATUS'] = data_to_plot['LAND_STATUS'] - 2.0 # shift from x axis
    #data_to_plot['STATUS_land'] = data_to_plot['STATUS_land']# shift from x axis
    
    data_to_plot['TIME'] = data_to_plot['TIME'] / 1.0e+6 # convert to seconds
    
    # get only necessary components
    if len(use_components) > 0:
        tmp = []
        for j in range(0, len(components_names)):
            if j in use_components:
                tmp.append(components_names[j])
        components_names = tmp
        #print components_names
        
    necessary_columns = components_names + ['LAND_STATUS', 'STATUS_land']
    
    # check empty time intervals and remove them from plots
    time_intervals = get_data_intervals(list(data_to_plot['TIME']))
    #if len(time_intervals) == 0:
    #    time_intervals = [(data_to_plot['TIME'].min(), data_to_plot['TIME'].max())]
    #print time_intervals
    
    for time_start, time_stop in time_intervals:
        interval_data = data_to_plot[(data_to_plot['TIME'] >= time_start) & 
                                     (data_to_plot['TIME'] <= time_stop)].reset_index(drop=True)
        

        # prepare xticks for time ploting
        x_ticks = interval_data['TIME']
        x_ticks_indexes = range(0, len(x_ticks), int(len(x_ticks) / (6 * 8) + 0.5))
        x_ticks = list(np.round(x_ticks[x_ticks_indexes].reset_index(drop=True), 2))


        fig, ax = plt.subplots(figsize = (14,8))
        
        if drone_state is None:
            y_vals = necessary_columns
        else:
            y_vals = necessary_columns[:-1]

            
        interval_data.plot(y = y_vals, kind = 'line', x = ['TIME'],
                          ax=ax, rot=70, xticks=x_ticks, grid = True,
                          )

        ax.set_title(title + ", Time inteval {:.2f}-{:.2f}".format(time_start, time_stop))
In [33]:
vizualize_fa(all_log_data, 19, fa_, features2use, use_components=[3,4])
vizualize_fa(all_log_data, 19, fa_, features2use, drone_state='land', use_components=[3,4])
In [34]:
for log_id in range(0, 25):
    vizualize_fa(all_log_data, log_id, fa_, features2use, use_components=[3,4])
    vizualize_fa(all_log_data, log_id, fa_, features2use, drone_state='land', use_components=[3,4])
C:\Users\sholc2005\Anaconda2\lib\site-packages\matplotlib\pyplot.py:516: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)

Clustering

In [35]:
from sklearn.cluster import KMeans
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import silhouette_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
import matplotlib.cm as cm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import svm
In [36]:
def remove_outliers_of_land_status(dataset, time_interval=0.8):
    # remove first 0.8 sec data from every log in the dataset in drone state Landing
    
    logid_list = dataset['LOG_ID'].unique()
    filtered_dataset = None
    
    # process each log_id
    for log_id in logid_list:
        
        # get logid landing data 
        logid_data = dataset[(dataset['LOG_ID'] == log_id) & (dataset['STATUS_land'] == 1.0)]
        
        # get time intervals
        time_intervals = get_data_intervals(list(logid_data['TIME']), 3.0 * 1.0e+6) # 3 sec
        #print log_id, np.array(time_intervals) / 1.0e+6
        
        # filter out time interval from begining of landing data
        for time_start, time_stop in time_intervals:
            time_interval_data = logid_data[(logid_data['TIME'] >= time_start + time_interval * 1.0e+6) &
                                             (logid_data['TIME'] <= time_stop)]
            #print time_interval_data.shape
            
            time_interval_data = time_interval_data.reset_index(drop=True)
            
            if filtered_dataset is None:
                filtered_dataset = time_interval_data
            else:
                filtered_dataset = filtered_dataset.reset_index(drop=True)
                filtered_dataset = pd.concat([filtered_dataset, time_interval_data])
                
    return filtered_dataset
        
In [37]:
check_data = remove_outliers_of_land_status(all_log_data)
In [38]:
vizualize_fa(check_data, 19, fa_, features2use)

Measure accuracy

In [39]:
def measure_accuracy(logid_list, dataset, preds, touch_detect_period=None):
    # measure avg recall score, precision and f1 score for data in logs and report recall score for all ids
    # for measuring using only landing state of the drone
    
    dataset_ = dataset.copy()
    dataset_.reset_index(drop=True, inplace=True)
    preds_ = pd.DataFrame(preds, columns=['Preds'])
    preds_.reset_index(drop=True, inplace=True)
    
    # concat dataset with preds column
    dataset_ = pd.concat([dataset_, preds_], axis=1)
    
    #recall_score = []
    #precision_score = []
    #f1_score = []
    
    if len(logid_list) == 0:
        # use all logs
        logid_list = list(dataset_['LOG_ID'].unique())
    #print logid_list
    # process each log
    
    # init counters
    pfp = 0.0 # predicted False Positive
    pfn = 0.0 # predicted False Negative
    ptn = 0.0 # predicted True Negative
    ptp = 0.0 # predicted True Positive
    ctp = 0.0 # current True Positive
    ctn = 0.0 # current True Negative
    
    for logid in logid_list:
        logid_data = remove_outliers_of_land_status(dataset_[dataset_['LOG_ID'] == logid]) # remove first 0.8 sec data from every log
        
        time_intervals = get_data_intervals(list(logid_data['TIME']), 3.0 * 1.0e+6) # 3 sec - theshold of time between intevals, all time intervals in the log
        

           
        num_intervals = float(len(time_intervals)) # counter, number of time intervals
        
        # process each landing time interval
        for time_start, time_stop in time_intervals:
            # get time interval data
            
            time_interval_data = logid_data[(logid_data['TIME'] >= time_start) &
                                             (logid_data['TIME'] <= time_stop)] 
            
            # get only touch data inside the time interval. 
            # Strongly that in landing time interval may be ony one touch moment
            touch_moment_data = time_interval_data[time_interval_data['LAND_STATUS'] == 1.0]
            
            # check if touching moment exists in the time period data
            # determine overlapping of touch moment and clustering results
            if touch_moment_data.shape[0] > 0:
                ctp += 1.0 # True Positive (inside the touch moment)
                # get touch moment's time interval inside the time interval
                touch_start_time = touch_moment_data['TIME'].min()
                touch_stop_time = touch_moment_data['TIME'].max()
                
                #print "diff", (touch_stop_time - touch_start_time) /1.0e+6
                
                if touch_detect_period is not None:
                    shift = touch_detect_period * 1.0e+6
                    #if touch_stop_time - touch_start_time < touch_detect_period:
                    touch_stop_time = touch_start_time + shift
                    #print "diff2", (touch_stop_time - touch_start_time) /1.0e+6
                    touch_moment_data = time_interval_data[(time_interval_data['TIME'] >= touch_start_time) & 
                                                          (time_interval_data['TIME'] <= touch_stop_time)]
                    
                # get time interval's data before the touch moment
                data_before_touch_moment = time_interval_data[(time_interval_data['TIME'] < touch_start_time) &
                                                             (time_interval_data['TIME'] >= time_start)]
                if data_before_touch_moment.shape[0] > 0:
                    ctn += 1.0 # True Negative
                                # process only with touch moment data presents in the time period
                    if data_before_touch_moment['Preds'].sum() >= 1.0:
                        pfp += 1.0 # predicted before time period (False Positive)
                        #print "predict: Log", logid, "FP before touch", data_before_touch_moment['Preds'].sum()
                    else:
                        ptn += 1.0 # predicted before time period (True Negative)
                
                # get time interval's data after the touch moment
                data_after_touch_moment = time_interval_data[(time_interval_data['TIME'] > touch_stop_time) &
                                                            (time_interval_data['TIME'] <= time_stop)]
#                if data_after_touch_moment.shape[0] > 0:
#                    ctn += 1.0 # True Negative 
#                    if data_after_touch_moment['Preds'].sum() >= 1.0:
#                        pfp += 1.0 # predicted after time period (False Positive)
#                    else:
#                        ptn += 1.0 # predicted after time period (True Negative)
                
                # check prediction inside touch moment
                if touch_moment_data['Preds'].sum() >= 1.0:
                    ptp += 1.0 # predicted in time period (True Positive)
                    #print "predict: Log", logid, "TP", touch_moment_data['Preds'].sum()
                else:
                    pfn += 1.0 # predicted in time period (False Negative)
                    #print "predict: Log", logid, "FN"
                    
            else:
                # no touch moment in time interval
                ctn += 1.0 # True Negative
                
                if time_interval_data['Preds'].sum() >= 1.0:
                    pfp += 1.0 # predicted in time period (False Positive)
                else:
                    ptn += 1.0 # predicted after time period (True Negative)
            
            

        
    tp = ptp / ctp # estimate True Positive rate
    fn = pfn / ctp # estimate False Negative rate

    #print tp, fn

    recall_ = tp / (tp + fn)
    #recall_score.append(recall_)

    fp = pfp / ctn # estimate False Positive rate

    if (tp + fp) == 0.0:
        precision_ = 0.0
    else:
        precision_ = tp / (tp + fp)
    #precision_score.append(precision_)

    if (precision_ + recall_) == 0.0:
        f1_ = 0.0
    else:
        f1_ = 2 * (precision_ * recall_) / (precision_ + recall_)
    #f1_score.append(f1_)
    
    return {'recall': recall_, 'precision':precision_, 'f1':f1_}
                
            
        
In [40]:
import time
def clusterize(clf, title="", touch_detect_period=None, comp_num=5):
    # clusterize data, deviding dataset into learning and testin data
    log_ids = np.arange(all_log_data['LOG_ID'].max() / 2)



    highest_prec = -np.inf
    highest_f1 = -np.inf
    best_kmeans = None
    best_fa_data = None
    best_preds = None
    split_num = 10
    learn_results = pd.DataFrame(columns=['SplitNum',  
                                          'F1_train', 'Prec_train', 'Recall_train',
                                          'F1_valid', 'Prec_valid', 'Recall_valid',
                                          'F1_test', 'Prec_test', 'Recall_test',
                                          'Silhouette', 'PredictTime', 'LearningTime', 'AIC', 'BIC'])
    counter = 0

    spl_num = 1
    print "Using:", title
    test_log_ids = np.arange(max(log_ids) + 1, 25)
    print "Logs to use in testing", test_log_ids
    only_test_data = all_log_data[(all_log_data['LOG_ID'].isin(test_log_ids)) &
                                 (all_log_data['STATUS_land'] == 1.0)].reset_index(drop=True)
    only_test_data = remove_outliers_of_land_status(only_test_data)
    otest_fa_data = fa_.transform(only_test_data[features2use])

    print "Logs to use in learning", log_ids
    ss = ShuffleSplit(n_splits=split_num, test_size=0.35, random_state=777)
    for train_index, test_index in ss.split(log_ids):
    
        train_data = all_log_data[(all_log_data['LOG_ID'].isin(train_index)) &
                                 (all_log_data['STATUS_land'] == 1.0)].reset_index(drop=True)
        train_data = remove_outliers_of_land_status(train_data)

        print "Split id: {:d}".format(spl_num)
        label0 = len(train_data[train_data['LAND_STATUS'] == 0.0])
        print "Label 0 size in the training set:", label0
        print "Label 1 size in the training set:", len(train_data[train_data['LAND_STATUS'] == 1.0])

        test_data = all_log_data[(all_log_data['LOG_ID'].isin(test_index)) &
                                (all_log_data['STATUS_land'] == 1.0)].reset_index(drop=True)
        test_data = remove_outliers_of_land_status(test_data)


        tr_fa_data = fa_.transform(train_data[features2use])
        tr_fa_data_y = train_data['LAND_STATUS'].reset_index(drop=True)

        print "Training...."
        #preds_train = clf.fit_predict(tr_fa_data) # Kmeans
        #preds_train =
        t0 = time.time() 
        clf.fit(tr_fa_data[:,0:comp_num], tr_fa_data_y)
        tlearn = time.time() - t0
        preds_train = clf.predict(tr_fa_data[:,0:comp_num])

        sil_sc = silhouette_score(X=tr_fa_data, labels=tr_fa_data_y, sample_size=1000);
        tr_acc = measure_accuracy([], train_data, preds_train, touch_detect_period=touch_detect_period) # learning accuracy

        tst_fa_data = fa_.transform(test_data[features2use])
        preds_test = clf.predict(tst_fa_data[:,0:comp_num])
        tst_acc = measure_accuracy([], test_data, preds_test) # validation accuracy
        t1 = time.time()
        preds_otest = clf.predict(otest_fa_data[:,0:comp_num])
        t2 = time.time() - t1

        test_acc = measure_accuracy([], only_test_data, preds_otest) # testing accuracy
        if title=='GMM':
            aikbic = [clf.aic(tr_fa_data[:,0:comp_num]), clf.bic(tr_fa_data[:,0:comp_num])]
        else:
            aikbic = [0.0, 0.0]
        learn_results.loc[counter] = [spl_num, 
                                      tr_acc['f1'], tr_acc['precision'], tr_acc['recall'],
                                      tst_acc['f1'], tst_acc['precision'], tst_acc['recall'],
                                      test_acc['f1'], test_acc['precision'], test_acc['recall'],
                                      sil_sc, t2, tlearn] + aikbic

        counter += 1

        spl_num += 1
        if  test_acc['precision'] > highest_prec:
            highest_prec = test_acc['precision']
            best_kmeans_prec = clf
            best_fa_data = fa_data
            best_preds = preds_train

        if test_acc['f1'] > highest_f1:
            highest_f1 = test_acc['f1']
            best_kmeans_f1 = clf
            best_fa_data = fa_data
            best_preds = preds_train

    if title != "GMM":
        learn_results.plot(kind='line', x=['SplitNum'], y=['F1_train', 'F1_valid', 'F1_test', 'Silhouette'], title=title)
        learn_results.plot(kind='line', x=['SplitNum'], y=['Recall_train', 'Recall_valid', 'Recall_test'], title=title)
        learn_results.plot(kind='line', x=['SplitNum'], y=['Prec_train', 'Prec_valid', 'Prec_test'], title=title)
    else:
        learn_results.plot(kind='line', x=['SplitNum'], y=['AIC', 'BIC'], title=title)
        learn_results.plot(kind='line', x=['SplitNum'], y=['F1_train', 'F1_valid', 'F1_test'], title=title)
        learn_results.plot(kind='line', x=['SplitNum'], y=['Recall_train', 'Recall_valid', 'Recall_test'], title=title)
        learn_results.plot(kind='line', x=['SplitNum'], y=['Prec_train', 'Prec_valid', 'Prec_test'], title=title)
        
    return best_kmeans_f1, best_kmeans_prec, learn_results
In [41]:
from sklearn import mixture
from sklearn.cluster import SpectralClustering

clf_Kmeans = KMeans(n_clusters=2, n_jobs=1, random_state=1111)
clf_GMM = mixture.GaussianMixture(n_components=2, random_state=1111)
clf_spcl = SpectralClustering(n_clusters=2, random_state=1111)
best_f1_r1,_,r1 = clusterize(clf_Kmeans, title="K-Means")
print r1
best_f1_r2,_,r2 = clusterize(clf_GMM, title="GMM")
print r2
#_,_,r3 = clusterize(clf_spcl, title="Spectral Clustering")
#print r3
Using: K-Means
Logs to use in testing [12 13 14 15 16 17 18 19 20 21 22 23 24]
Logs to use in learning [ 0  1  2  3  4  5  6  7  8  9 10 11]
Split id: 1
Label 0 size in the training set: 5757
Label 1 size in the training set: 434
Training....
Split id: 2
Label 0 size in the training set: 6443
Label 1 size in the training set: 537
Training....
Split id: 3
Label 0 size in the training set: 7470
Label 1 size in the training set: 549
Training....
Split id: 4
Label 0 size in the training set: 6662
Label 1 size in the training set: 552
Training....
Split id: 5
Label 0 size in the training set: 7670
Label 1 size in the training set: 571
Training....
Split id: 6
Label 0 size in the training set: 6848
Label 1 size in the training set: 549
Training....
Split id: 7
Label 0 size in the training set: 7670
Label 1 size in the training set: 571
Training....
Split id: 8
Label 0 size in the training set: 4823
Label 1 size in the training set: 424
Training....
Split id: 9
Label 0 size in the training set: 5353
Label 1 size in the training set: 433
Training....
Split id: 10
Label 0 size in the training set: 4969
Label 1 size in the training set: 441
Training....
   SplitNum  F1_train  Prec_train  Recall_train  F1_valid  Prec_valid  \
0       1.0  0.666667    0.500000           1.0  0.666667    0.500000   
1       2.0  0.941176    0.888889           1.0  1.000000    1.000000   
2       3.0  0.975610    0.952381           1.0  0.833333    0.714286   
3       4.0  0.666667    0.500000           1.0  0.666667    0.500000   
4       5.0  0.969697    0.941176           1.0  0.900000    0.818182   
5       6.0  0.666667    0.500000           1.0  0.666667    0.500000   
6       7.0  0.969697    0.941176           1.0  0.900000    0.818182   
7       8.0  0.666667    0.500000           1.0  0.666667    0.500000   
8       9.0  0.960000    0.923077           1.0  1.000000    1.000000   
9      10.0  0.666667    0.500000           1.0  0.666667    0.500000   

   Recall_valid   F1_test  Prec_test  Recall_test  Silhouette  PredictTime  \
0           1.0  0.666667   0.500000     1.000000    0.361260        0.003   
1           1.0  0.864865   0.872727     0.857143    0.245249        0.003   
2           1.0  0.864865   0.872727     0.857143    0.192009        0.003   
3           1.0  0.634146   0.481481     0.928571    0.168879        0.003   
4           1.0  0.864865   0.872727     0.857143    0.163659        0.003   
5           1.0  0.666667   0.500000     1.000000    0.313369        0.003   
6           1.0  0.864865   0.872727     0.857143    0.286844        0.004   
7           1.0  0.666667   0.500000     1.000000    0.304645        0.002   
8           1.0  0.875912   0.895522     0.857143    0.366807        0.003   
9           1.0  0.666667   0.500000     1.000000    0.376418        0.004   

   LearningTime  AIC  BIC  
0         0.110  0.0  0.0  
1         0.067  0.0  0.0  
2         0.073  0.0  0.0  
3         0.065  0.0  0.0  
4         0.091  0.0  0.0  
5         0.057  0.0  0.0  
6         0.090  0.0  0.0  
7         0.039  0.0  0.0  
8         0.056  0.0  0.0  
9         0.073  0.0  0.0  
Using: GMM
Logs to use in testing [12 13 14 15 16 17 18 19 20 21 22 23 24]
Logs to use in learning [ 0  1  2  3  4  5  6  7  8  9 10 11]
Split id: 1
Label 0 size in the training set: 5757
Label 1 size in the training set: 434
Training....
Split id: 2
Label 0 size in the training set: 6443
Label 1 size in the training set: 537
Training....
Split id: 3
Label 0 size in the training set: 7470
Label 1 size in the training set: 549
Training....
Split id: 4
Label 0 size in the training set: 6662
Label 1 size in the training set: 552
Training....
Split id: 5
Label 0 size in the training set: 7670
Label 1 size in the training set: 571
Training....
Split id: 6
Label 0 size in the training set: 6848
Label 1 size in the training set: 549
Training....
Split id: 7
Label 0 size in the training set: 7670
Label 1 size in the training set: 571
Training....
Split id: 8
Label 0 size in the training set: 4823
Label 1 size in the training set: 424
Training....
Split id: 9
Label 0 size in the training set: 5353
Label 1 size in the training set: 433
Training....
Split id: 10
Label 0 size in the training set: 4969
Label 1 size in the training set: 441
Training....
   SplitNum  F1_train  Prec_train  Recall_train  F1_valid  Prec_valid  \
0       1.0  0.666667    0.500000           1.0  0.666667    0.500000   
1       2.0  0.666667    0.500000           1.0  0.666667    0.500000   
2       3.0  0.930233    0.869565           1.0  0.769231    0.625000   
3       4.0  0.666667    0.500000           1.0  0.666667    0.500000   
4       5.0  0.666667    0.500000           1.0  0.666667    0.500000   
5       6.0  0.909091    0.833333           1.0  0.769231    0.625000   
6       7.0  0.666667    0.500000           1.0  0.666667    0.500000   
7       8.0  0.666667    0.500000           1.0  0.666667    0.500000   
8       9.0  0.923077    0.857143           1.0  0.928571    0.866667   
9      10.0  0.666667    0.500000           1.0  0.666667    0.500000   

   Recall_valid   F1_test  Prec_test  Recall_test  Silhouette  PredictTime  \
0           1.0  0.634146   0.481481     0.928571    0.428901        0.003   
1           1.0  0.634146   0.481481     0.928571    0.243311        0.006   
2           1.0  0.806202   0.712329     0.928571    0.171567        0.004   
3           1.0  0.634146   0.481481     0.928571    0.157388        0.003   
4           1.0  0.666667   0.500000     1.000000    0.166738        0.003   
5           1.0  0.784314   0.645161     1.000000    0.342889        0.003   
6           1.0  0.666667   0.500000     1.000000    0.144772        0.007   
7           1.0  0.634146   0.481481     0.928571    0.270894        0.003   
8           1.0  0.833333   0.755814     0.928571    0.322939        0.003   
9           1.0  0.600000   0.461538     0.857143    0.346540        0.003   

   LearningTime           AIC           BIC  
0         0.131  40660.410007  40936.374935  
1         0.033  54101.974935  54382.857907  
2         0.025  63212.129231  63498.701560  
3         0.027  55583.034437  55865.269370  
4         0.141  64790.907724  65078.599680  
5         0.056  57772.086835  58055.348857  
6         0.110  64790.907724  65078.599680  
7         0.023  37888.389878  38157.571760  
8         0.042  40219.221967  40492.413023  
9         0.038  37863.420573  38133.856752  
In [42]:
print r1.describe()
print r1
       SplitNum   F1_train  Prec_train  Recall_train   F1_valid  Prec_valid  \
count  10.00000  10.000000   10.000000          10.0  10.000000   10.000000   
mean    5.50000   0.814951    0.714670           1.0   0.796667    0.685065   
std     3.02765   0.156566    0.226892           0.0   0.145254    0.212370   
min     1.00000   0.666667    0.500000           1.0   0.666667    0.500000   
25%     3.25000   0.666667    0.500000           1.0   0.666667    0.500000   
50%     5.50000   0.803922    0.694444           1.0   0.750000    0.607143   
75%     7.75000   0.967273    0.936652           1.0   0.900000    0.818182   
max    10.00000   0.975610    0.952381           1.0   1.000000    1.000000   

       Recall_valid    F1_test  Prec_test  Recall_test  Silhouette  \
count          10.0  10.000000  10.000000    10.000000   10.000000   
mean            1.0   0.763618   0.686791     0.921429    0.277914   
std             0.0   0.109532   0.200990     0.071031    0.081644   
min             1.0   0.634146   0.481481     0.857143    0.163659   
25%             1.0   0.666667   0.500000     0.857143    0.205319   
50%             1.0   0.765766   0.686364     0.892857    0.295745   
75%             1.0   0.864865   0.872727     1.000000    0.349287   
max             1.0   0.875912   0.895522     1.000000    0.376418   

       PredictTime  LearningTime   AIC   BIC  
count    10.000000      10.00000  10.0  10.0  
mean      0.003100       0.07210   0.0   0.0  
std       0.000568       0.02048   0.0   0.0  
min       0.002000       0.03900   0.0   0.0  
25%       0.003000       0.05900   0.0   0.0  
50%       0.003000       0.07000   0.0   0.0  
75%       0.003000       0.08575   0.0   0.0  
max       0.004000       0.11000   0.0   0.0  
   SplitNum  F1_train  Prec_train  Recall_train  F1_valid  Prec_valid  \
0       1.0  0.666667    0.500000           1.0  0.666667    0.500000   
1       2.0  0.941176    0.888889           1.0  1.000000    1.000000   
2       3.0  0.975610    0.952381           1.0  0.833333    0.714286   
3       4.0  0.666667    0.500000           1.0  0.666667    0.500000   
4       5.0  0.969697    0.941176           1.0  0.900000    0.818182   
5       6.0  0.666667    0.500000           1.0  0.666667    0.500000   
6       7.0  0.969697    0.941176           1.0  0.900000    0.818182   
7       8.0  0.666667    0.500000           1.0  0.666667    0.500000   
8       9.0  0.960000    0.923077           1.0  1.000000    1.000000   
9      10.0  0.666667    0.500000           1.0  0.666667    0.500000   

   Recall_valid   F1_test  Prec_test  Recall_test  Silhouette  PredictTime  \
0           1.0  0.666667   0.500000     1.000000    0.361260        0.003   
1           1.0  0.864865   0.872727     0.857143    0.245249        0.003   
2           1.0  0.864865   0.872727     0.857143    0.192009        0.003   
3           1.0  0.634146   0.481481     0.928571    0.168879        0.003   
4           1.0  0.864865   0.872727     0.857143    0.163659        0.003   
5           1.0  0.666667   0.500000     1.000000    0.313369        0.003   
6           1.0  0.864865   0.872727     0.857143    0.286844        0.004   
7           1.0  0.666667   0.500000     1.000000    0.304645        0.002   
8           1.0  0.875912   0.895522     0.857143    0.366807        0.003   
9           1.0  0.666667   0.500000     1.000000    0.376418        0.004   

   LearningTime  AIC  BIC  
0         0.110  0.0  0.0  
1         0.067  0.0  0.0  
2         0.073  0.0  0.0  
3         0.065  0.0  0.0  
4         0.091  0.0  0.0  
5         0.057  0.0  0.0  
6         0.090  0.0  0.0  
7         0.039  0.0  0.0  
8         0.056  0.0  0.0  
9         0.073  0.0  0.0  
In [43]:
print r2.describe()
print r2
       SplitNum   F1_train  Prec_train  Recall_train   F1_valid  Prec_valid  \
count  10.00000  10.000000   10.000000          10.0  10.000000   10.000000   
mean    5.50000   0.742907    0.606004           1.0   0.713370    0.561667   
std     3.02765   0.122863    0.170903           0.0   0.086808    0.119102   
min     1.00000   0.666667    0.500000           1.0   0.666667    0.500000   
25%     3.25000   0.666667    0.500000           1.0   0.666667    0.500000   
50%     5.50000   0.666667    0.500000           1.0   0.666667    0.500000   
75%     7.75000   0.848485    0.750000           1.0   0.743590    0.593750   
max    10.00000   0.930233    0.869565           1.0   0.928571    0.866667   

       Recall_valid    F1_test  Prec_test  Recall_test  Silhouette  \
count          10.0  10.000000  10.000000    10.000000   10.000000   
mean            1.0   0.689377   0.550077     0.942857    0.259594   
std             0.0   0.084726   0.110237     0.045175    0.098640   
min             1.0   0.600000   0.461538     0.857143    0.144772   
25%             1.0   0.634146   0.481481     0.928571    0.167945   
50%             1.0   0.650407   0.490741     0.928571    0.257102   
75%             1.0   0.754902   0.608871     0.982143    0.337902   
max             1.0   0.833333   0.755814     1.000000    0.428901   

       PredictTime  LearningTime           AIC           BIC  
count    10.000000     10.000000     10.000000     10.000000  
mean      0.003800      0.062600  51688.248331  51967.959352  
std       0.001476      0.046273  11396.302826  11403.245568  
min       0.003000      0.023000  37863.420573  38133.856752  
25%       0.003000      0.028500  40329.518977  40603.403501  
50%       0.003000      0.040000  54842.504686  55124.063638  
75%       0.003750      0.096500  61852.118632  62137.863384  
max       0.007000      0.141000  64790.907724  65078.599680  
   SplitNum  F1_train  Prec_train  Recall_train  F1_valid  Prec_valid  \
0       1.0  0.666667    0.500000           1.0  0.666667    0.500000   
1       2.0  0.666667    0.500000           1.0  0.666667    0.500000   
2       3.0  0.930233    0.869565           1.0  0.769231    0.625000   
3       4.0  0.666667    0.500000           1.0  0.666667    0.500000   
4       5.0  0.666667    0.500000           1.0  0.666667    0.500000   
5       6.0  0.909091    0.833333           1.0  0.769231    0.625000   
6       7.0  0.666667    0.500000           1.0  0.666667    0.500000   
7       8.0  0.666667    0.500000           1.0  0.666667    0.500000   
8       9.0  0.923077    0.857143           1.0  0.928571    0.866667   
9      10.0  0.666667    0.500000           1.0  0.666667    0.500000   

   Recall_valid   F1_test  Prec_test  Recall_test  Silhouette  PredictTime  \
0           1.0  0.634146   0.481481     0.928571    0.428901        0.003   
1           1.0  0.634146   0.481481     0.928571    0.243311        0.006   
2           1.0  0.806202   0.712329     0.928571    0.171567        0.004   
3           1.0  0.634146   0.481481     0.928571    0.157388        0.003   
4           1.0  0.666667   0.500000     1.000000    0.166738        0.003   
5           1.0  0.784314   0.645161     1.000000    0.342889        0.003   
6           1.0  0.666667   0.500000     1.000000    0.144772        0.007   
7           1.0  0.634146   0.481481     0.928571    0.270894        0.003   
8           1.0  0.833333   0.755814     0.928571    0.322939        0.003   
9           1.0  0.600000   0.461538     0.857143    0.346540        0.003   

   LearningTime           AIC           BIC  
0         0.131  40660.410007  40936.374935  
1         0.033  54101.974935  54382.857907  
2         0.025  63212.129231  63498.701560  
3         0.027  55583.034437  55865.269370  
4         0.141  64790.907724  65078.599680  
5         0.056  57772.086835  58055.348857  
6         0.110  64790.907724  65078.599680  
7         0.023  37888.389878  38157.571760  
8         0.042  40219.221967  40492.413023  
9         0.038  37863.420573  38133.856752  
In [44]:
def show_cluster_results(fa_data, preds, num_clusters, dimensions, true_data=None, title="K-Means"):
    # visualize clustering results
   
    components_names = ['Component {}'.format(i) for i in range(1, fa_data.shape[1] + 1)]
    fa_data = pd.DataFrame(fa_data, columns=components_names)
   
    if true_data is not None:
        true_data = pd.DataFrame(true_data, columns=components_names)
   

    predictions = pd.DataFrame(preds, columns = ['Cluster'])   
    plot_data = pd.concat([predictions, fa_data], axis = 1)


    # Set color map
    cmap = cm.get_cmap('gist_rainbow')

    for dims in dimensions: # plot for particular dim pairs
       
        # Generate the cluster plot
        cluster_A = 'Component {}'.format(dims[0])
        cluster_B = 'Component {}'.format(dims[1])
       
        fig, ax  = plt.subplots(figsize = (14,8))
       
        #print plot_data.keys()
        # Color the points based on assigned cluster
        for i, cluster in plot_data.groupby('Cluster'):
            cluster.plot(ax = ax, kind = 'scatter', x = cluster_A, y = cluster_B, \
                         color = cmap((i) * 1.0 / (num_clusters - 1)), label = 'Cluster %i'%(i), s=30);
       
        # Plot clusters
#        for i, c in enumerate(range(1, num_clusters + 1)):
#            ax.scatter(x = c[fa_data.columns.get_loc(cluster_A)], \
#                       y = c[fa_data.columns.get_loc(cluster_B)], \
#                       color = cmap((i) * 1.0 / (num_clusters - 1)), edgecolors = 'black', \
#                       alpha = 1, linewidth = 2, marker = 'o', s=200);
#           
#            ax.scatter(x = c[fa_data.columns.get_loc(cluster_A)], \
#                       y = c[fa_data.columns.get_loc(cluster_B)], marker='$%d$'%(i), alpha = 1, s=100);

            
        if true_data is not None:
            true_data.plot(ax = ax, kind = 'scatter', x = cluster_A, y = cluster_B, \
                         color = 'b', label = 'True data', s=20);            
        # Set plot title
        ax.set_title("{:s} clustering on FA analysis data for N={:d} clusters".format(title, num_clusters));
In [56]:
test_log_ids = np.arange(12, 25)
only_test_data = all_log_data[(all_log_data['LOG_ID'].isin(test_log_ids)) &
                                 (all_log_data['STATUS_land'] == 1.0)].reset_index(drop=True)

only_test_data = remove_outliers_of_land_status(only_test_data)

otest_fa_data = fa_.transform(only_test_data[features2use])


true_data = only_test_data[only_test_data['LAND_STATUS'] == 1.0]
true_data = fa_.transform(true_data[features2use])

best_preds = best_f1_r1.predict(otest_fa_data[:,0:5])

show_cluster_results(otest_fa_data[:,0:5], best_preds, 2, [[1, 2]], true_data[:,0:5])
show_cluster_results(otest_fa_data[:,0:5], best_preds, 2, [[2, 3]], true_data[:,0:5])
In [57]:
true_data = only_test_data[only_test_data['LAND_STATUS'] == 1.0]
true_data = fa_.transform(true_data[features2use])

best_preds = best_f1_r2.predict(otest_fa_data[:,0:5])

show_cluster_results(otest_fa_data[:,0:5], best_preds, 2, [[1, 2]], true_data[:,0:5], title="GMM")
show_cluster_results(otest_fa_data[:,0:5], best_preds, 2, [[2, 3]], true_data[:,0:5], title="GMM")

Tune final solution

  • use best cls with highest f1 score value
  • write own classifier which use radius of the second cluser
  • measure final accuracy
In [61]:
train_index = np.arange(0, 12)
train_data = all_log_data[(all_log_data['LOG_ID'].isin(train_index)) &
                          (all_log_data['STATUS_land'] == 1.0)].reset_index(drop=True)
train_data = remove_outliers_of_land_status(train_data)
tr_fa_data = fa_.transform(train_data[features2use])

tr_fa_data_y = train_data['LAND_STATUS'].reset_index(drop=True)
params = ['full', 'tied', 'diag', 'spherical']

iters = [100, 200, 300]
finest_aic = np.inf
finest_bic = np.inf
finest_f1 = -np.inf
best_params_aic = []
best_params_bic = []
best_params_f1 = []

for iter_ in iters:
    print "Max_iter", iter_
    for param in params:
        clf_GMM = mixture.GaussianMixture(n_components=2, covariance_type=param, max_iter=iter_, tol=0.1)
        clf_GMM.fit(tr_fa_data[:,0:5],tr_fa_data_y)
        preds_otest = clf_GMM.predict(otest_fa_data[:,0:5])
        #acc = measure_accuracy([], only_test_data, preds_otest, touch_detect_period=0.8)
        acc = measure_accuracy([], only_test_data, preds_otest, touch_detect_period=0.8)
        aic = clf_GMM.aic(tr_fa_data[:,0:5])
        bic = clf_GMM.bic(tr_fa_data[:,0:5])
        print "Type = {:s}, F1_score = {:.3f}, Recall={:.3f}, Precision={:.3f}, AIC={:.3f}, BIC={:.3f}".format(param, 
                                                                                           acc['f1'], 
                                                                                           acc['recall'],
                                                                                           acc['precision'],
                                                                                            aic, bic)
        
        if acc['f1'] > finest_f1:
            finest_f1 = acc['f1']
            f1_best = clf_GMM
            best_params_f1 = [iter_, param, aic]
        
        if aic < finest_aic:
            finest_aic = aic
            aic_best = clf_GMM
            best_params_aic = [iter_, param, aic]
            
        if bic < finest_bic:
            finest_bic = bic
            bic_best = clf_GMM
            best_params_bic = [iter_, param, bic]    
            
print "Best params for AIC", best_params_aic
print "Best params for BIC", best_params_bic
print "Best params for F1", best_params_f1
Max_iter 100
Type = full, F1_score = 0.824, Recall=0.929, Precision=0.741, AIC=91489.180, BIC=91790.435
Type = tied, F1_score = 0.667, Recall=1.000, Precision=0.500, AIC=106892.993, BIC=107084.033
Type = diag, F1_score = 0.808, Recall=1.000, Precision=0.678, AIC=98359.372, BIC=98513.673
Type = spherical, F1_score = 0.714, Recall=1.000, Precision=0.556, AIC=104335.005, BIC=104430.524
Max_iter 200
Type = full, F1_score = 0.824, Recall=0.929, Precision=0.741, AIC=91496.450, BIC=91797.704
Type = tied, F1_score = 0.941, Recall=1.000, Precision=0.889, AIC=106892.993, BIC=107084.033
Type = diag, F1_score = 0.600, Recall=0.857, Precision=0.462, AIC=98359.372, BIC=98513.673
Type = spherical, F1_score = 0.714, Recall=1.000, Precision=0.556, AIC=104336.177, BIC=104431.697
Max_iter 300
Type = full, F1_score = 0.824, Recall=0.929, Precision=0.741, AIC=91496.450, BIC=91797.704
Type = tied, F1_score = 0.941, Recall=1.000, Precision=0.889, AIC=106892.993, BIC=107084.033
Type = diag, F1_score = 0.808, Recall=1.000, Precision=0.678, AIC=98359.372, BIC=98513.673
Type = spherical, F1_score = 0.714, Recall=1.000, Precision=0.556, AIC=104336.177, BIC=104431.697
Best params for AIC [100, 'full', 91489.180489317034]
Best params for BIC [100, 'full', 91790.43473639345]
Best params for F1 [200, 'tied', 106892.99325934482]
In [62]:
true_data = only_test_data[only_test_data['LAND_STATUS'] == 1.0]
true_data = fa_.transform(true_data[features2use])

clf_GMM = mixture.GaussianMixture(n_components=2, covariance_type='tied', max_iter=300, tol=0.1)
clf_GMM.fit(tr_fa_data[:,0:5])
preds_final = clf_GMM.predict(otest_fa_data[:,0:5])
show_cluster_results(otest_fa_data[:,0:5], preds, 2, [[1, 2]], true_data=true_data[:,0:5], title='GMM')
In [63]:
clf_GMM = mixture.GaussianMixture(n_components=2, max_iter=200, tol=0.1, covariance_type='tied')
best_f1_final,_,r2_final = clusterize(clf_GMM, title="GMM")
Using: GMM
Logs to use in testing [12 13 14 15 16 17 18 19 20 21 22 23 24]
Logs to use in learning [ 0  1  2  3  4  5  6  7  8  9 10 11]
Split id: 1
Label 0 size in the training set: 5757
Label 1 size in the training set: 434
Training....
Split id: 2
Label 0 size in the training set: 6443
Label 1 size in the training set: 537
Training....
Split id: 3
Label 0 size in the training set: 7470
Label 1 size in the training set: 549
Training....
Split id: 4
Label 0 size in the training set: 6662
Label 1 size in the training set: 552
Training....
Split id: 5
Label 0 size in the training set: 7670
Label 1 size in the training set: 571
Training....
Split id: 6
Label 0 size in the training set: 6848
Label 1 size in the training set: 549
Training....
Split id: 7
Label 0 size in the training set: 7670
Label 1 size in the training set: 571
Training....
Split id: 8
Label 0 size in the training set: 4823
Label 1 size in the training set: 424
Training....
Split id: 9
Label 0 size in the training set: 5353
Label 1 size in the training set: 433
Training....
Split id: 10
Label 0 size in the training set: 4969
Label 1 size in the training set: 441
Training....
In [64]:
print r2_final
print r2_final.describe()
   SplitNum  F1_train  Prec_train  Recall_train  F1_valid  Prec_valid  \
0       1.0  0.901408    0.914286      0.888889  0.821053    0.906977   
1       2.0  0.666667    0.500000      1.000000  0.666667    0.500000   
2       3.0  0.975610    0.952381      1.000000  0.909091    0.833333   
3       4.0  0.885246    0.870968      0.900000  0.869565    0.769231   
4       5.0  0.969697    0.941176      1.000000  0.857143    0.882353   
5       6.0  0.932203    0.948276      0.916667  0.909091    0.833333   
6       7.0  0.969697    0.941176      1.000000  0.857143    0.882353   
7       8.0  0.933333    0.875000      1.000000  0.920455    0.941860   
8       9.0  0.864865    0.842105      0.888889  0.857143    1.000000   
9      10.0  0.888889    0.800000      1.000000  0.941176    1.000000   

   Recall_valid   F1_test  Prec_test  Recall_test  Silhouette  PredictTime  \
0      0.750000  0.926916   0.925267     0.928571    0.460110        0.003   
1      1.000000  0.666667   0.500000     1.000000    0.221131        0.003   
2      1.000000  0.904348   0.881356     0.928571    0.207283        0.002   
3      1.000000  0.680062   0.599455     0.785714    0.255798        0.003   
4      0.833333  0.904348   0.881356     0.928571    0.185358        0.004   
5      1.000000  0.904348   0.881356     0.928571    0.261995        0.008   
6      0.833333  0.904348   0.881356     0.928571    0.186439        0.002   
7      0.900000  0.904348   0.881356     0.928571    0.299475        0.002   
8      0.750000  0.926916   0.925267     0.928571    0.305086        0.003   
9      0.888889  0.904348   0.881356     0.928571    0.406755        0.003   

   LearningTime           AIC           BIC  
0         0.015  51413.657292  51588.659442  
1         0.016  62658.001564  62836.122473  
2         0.014  69710.117793  69891.846587  
3         0.020  70526.871298  70705.849548  
4         0.015  73221.049552  73403.488354  
5         0.031  71166.236994  71345.866568  
6         0.027  73221.049552  73403.488354  
7         0.011  47334.450031  47505.150737  
8         0.014  49282.846496  49456.089605  
9         0.012  48379.640902  48551.137016  
       SplitNum   F1_train  Prec_train  Recall_train   F1_valid  Prec_valid  \
count  10.00000  10.000000   10.000000     10.000000  10.000000   10.000000   
mean    5.50000   0.898762    0.858537      0.959444   0.860853    0.854944   
std     3.02765   0.090423    0.135932      0.052903   0.077508    0.144573   
min     1.00000   0.666667    0.500000      0.888889   0.666667    0.500000   
25%     3.25000   0.886157    0.849321      0.904167   0.857143    0.833333   
50%     5.50000   0.916806    0.894643      1.000000   0.863354    0.882353   
75%     7.75000   0.960606    0.941176      1.000000   0.909091    0.933140   
max    10.00000   0.975610    0.952381      1.000000   0.941176    1.000000   

       Recall_valid    F1_test  Prec_test  Recall_test  Silhouette  \
count     10.000000  10.000000  10.000000    10.000000   10.000000   
mean       0.895556   0.862665   0.823812     0.921429    0.278943   
std        0.102097   0.100244   0.147439     0.052705    0.092423   
min        0.750000   0.666667   0.500000     0.785714    0.185358   
25%        0.833333   0.904348   0.881356     0.928571    0.210745   
50%        0.894444   0.904348   0.881356     0.928571    0.258897   
75%        1.000000   0.904348   0.881356     0.928571    0.303683   
max        1.000000   0.926916   0.925267     1.000000    0.460110   

       PredictTime  LearningTime           AIC           BIC  
count    10.000000     10.000000     10.000000     10.000000  
mean      0.003300      0.017500  61691.392147  61868.769868  
std       0.001767      0.006587  11263.840958  11268.158345  
min       0.002000      0.011000  47334.450031  47505.150737  
25%       0.002250      0.014000  49815.549195  49989.232064  
50%       0.003000      0.015000  66184.059679  66363.984530  
75%       0.003000      0.019000  71006.395570  71185.862313  
max       0.008000      0.031000  73221.049552  73403.488354  
In [53]:
def visualize_prediction(dataset, preds):
    # visualize prediction of clustering
    log_ids = dataset['LOG_ID'].unique()
    logs_data = dataset.copy()
    logs_data.reset_index(drop=True, inplace=True)
    predictions = pd.DataFrame(preds, columns = ['Cluster'])
    predictions.reset_index(drop=True, inplace=True)
    
    logs_data = pd.concat([logs_data, predictions], axis=1)
    #print logs_data.columns
    
    for logid in log_ids:
        log_data = logs_data[logs_data['LOG_ID'] == logid]
        #print log_data.describe()
        #print 
        
        log_data['Cluster'] = log_data['Cluster'] + 0.02 # shift by plus one
        log_data['TIME'] = log_data['TIME'] / 1.0e+6 # convert to seconds
        time_intervals = get_data_intervals(list(log_data['TIME']))

        #print time_intervals, logid
        for time_start, time_stop in time_intervals:
            interval_data = log_data[(log_data['TIME'] >= time_start) & 
                                         (log_data['TIME'] <= time_stop)].reset_index(drop=True)
        
            x_ticks = interval_data['TIME']
            try:
                x_ticks_indexes = range(0, len(x_ticks), int(len(x_ticks) / (6 * 8) + 0.5))
            except:
                continue 
            x_ticks = list(np.round(x_ticks[x_ticks_indexes].reset_index(drop=True), 2))

            fig, ax = plt.subplots(figsize = (14,2))
            interval_data.plot(y = ['LAND_STATUS', 'Cluster'], kind = 'line', x = ['TIME'],
                              ax=ax, rot=70, xticks=x_ticks, grid = True)
            ax.set_title("LogID={:d}, Time inteval {:.2f}-{:.2f}".format(logid, time_start, time_stop))
            ax.set_ylim([-0.02,2])
In [65]:
visualize_prediction(only_test_data, preds_final)
C:\Users\sholc2005\Anaconda2\lib\site-packages\ipykernel\__main__.py:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\sholc2005\Anaconda2\lib\site-packages\ipykernel\__main__.py:18: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
In [ ]: